In [45]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn import cross_validation
from sklearn.cross_validation import cross_val_score

In [46]:
df = pd.read_csv("data/Pay_Scale_data.csv", low_memory=False)

In [47]:
df.head()


Out[47]:
rank name Anual%_ROI_without_FA Anual%_ROI_with_FA 20yrnet_ROI_without_FA 20yrnet_ROI_with_FA Total_4yr_cost grad_rate years_to_grad loan_amnt %_GrantMoney_Received
0 - University of Phoenix - Phoenix, AZ(Private) NaN NaN NaN NaN NaN 16 6 33520 91
1 - Cleary University(Private) NaN NaN NaN NaN NaN 44 4 24360 70
2 - ITT Technical Institute - San Diego, CA(Private) NaN NaN NaN NaN NaN 27 6 34120 70
3 - Wilmington University(Private) NaN NaN NaN NaN NaN 39 4 12680 73
4 - Thomas Edison State College(In-State) NaN NaN NaN NaN NaN 0 NaN 0 0

In [48]:
df = df.dropna()

In [49]:
df.head()


Out[49]:
rank name Anual%_ROI_without_FA Anual%_ROI_with_FA 20yrnet_ROI_without_FA 20yrnet_ROI_with_FA Total_4yr_cost grad_rate years_to_grad loan_amnt %_GrantMoney_Received
5 1 Harvey Mudd College(Private) 8.7 12.6 985300 1104500 237700 91 4 21920 71
6 2 California Institute of Technology (Caltech)(P... 8.6 13.4 901400 1029700 221600 93 4 22160 54
7 3 Stevens Institute of Technology(Private) 8.1 11.5 841000 948300 232000 79 5 44000 97
8 4 Colorado School of Mines(In-State) 11.4 13.5 831000 866200 112000 70 5 30480 72
9 5 Babson College(Private) 8.0 12.8 812800 946500 230200 91 4 31880 45

In [50]:
df.describe()


Out[50]:
Total_4yr_cost grad_rate years_to_grad loan_amnt %_GrantMoney_Received
count 1217.000000 1217.000000 1217.000000 1217.000000 1217.000000
mean 130077.156943 56.564503 4.517666 27865.242399 77.032046
std 47645.786880 17.124524 0.516082 5603.649815 17.735255
min 46500.000000 11.000000 4.000000 9680.000000 25.000000
25% 89500.000000 43.000000 4.000000 23960.000000 63.000000
50% 121700.000000 55.000000 5.000000 27360.000000 78.000000
75% 163000.000000 69.000000 5.000000 31080.000000 94.000000
max 245000.000000 98.000000 6.000000 51600.000000 100.000000

Modelling


In [51]:
train_X = df[['Total_4yr_cost', 'grad_rate', 'years_to_grad', 'loan_amnt', '%_GrantMoney_Received']]

In [52]:
train_Y = df['Anual%_ROI_without_FA']

In [53]:
model = RandomForestRegressor(random_state=0, n_estimators=100, max_depth=50)

In [43]:
abs(np.mean(cross_val_score(model, train_X, train_Y, cv=5, scoring='mean_squared_error')))**0.5


---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-43-162832098a69> in <module>()
----> 1 abs(np.mean(cross_val_score(model, train_X, train_Y, cv=5, scoring='mean_squared_error')))**0.5

/Users/nandini/anaconda/lib/python3.3/site-packages/sklearn/cross_validation.py in cross_val_score(estimator, X, y, scoring, cv, n_jobs, verbose, fit_params, pre_dispatch)
   1441                                               train, test, verbose, None,
   1442                                               fit_params)
-> 1443                       for train, test in cv)
   1444     return np.array(scores)[:, 0]
   1445 

/Users/nandini/anaconda/lib/python3.3/site-packages/sklearn/externals/joblib/parallel.py in __call__(self, iterable)
    803             self._iterating = True
    804 
--> 805             while self.dispatch_one_batch(iterator):
    806                 pass
    807 

/Users/nandini/anaconda/lib/python3.3/site-packages/sklearn/externals/joblib/parallel.py in dispatch_one_batch(self, iterator)
    661                 return False
    662             else:
--> 663                 self._dispatch(tasks)
    664                 return True
    665 

/Users/nandini/anaconda/lib/python3.3/site-packages/sklearn/externals/joblib/parallel.py in _dispatch(self, batch)
    569 
    570         if self._pool is None:
--> 571             job = ImmediateComputeBatch(batch)
    572             self._jobs.append(job)
    573             self.n_dispatched_batches += 1

/Users/nandini/anaconda/lib/python3.3/site-packages/sklearn/externals/joblib/parallel.py in __init__(self, batch)
    182         # Don't delay the application, to avoid keeping the input
    183         # arguments in memory
--> 184         self.results = batch()
    185 
    186     def get(self):

/Users/nandini/anaconda/lib/python3.3/site-packages/sklearn/externals/joblib/parallel.py in __call__(self)
     71 
     72     def __call__(self):
---> 73         return [func(*args, **kwargs) for func, args, kwargs in self.items]
     74 
     75     def __len__(self):

/Users/nandini/anaconda/lib/python3.3/site-packages/sklearn/externals/joblib/parallel.py in <listcomp>(.0)
     71 
     72     def __call__(self):
---> 73         return [func(*args, **kwargs) for func, args, kwargs in self.items]
     74 
     75     def __len__(self):

/Users/nandini/anaconda/lib/python3.3/site-packages/sklearn/cross_validation.py in _fit_and_score(estimator, X, y, scorer, train, test, verbose, parameters, fit_params, return_train_score, return_parameters, error_score)
   1539             estimator.fit(X_train, **fit_params)
   1540         else:
-> 1541             estimator.fit(X_train, y_train, **fit_params)
   1542 
   1543     except Exception as e:

/Users/nandini/anaconda/lib/python3.3/site-packages/sklearn/ensemble/forest.py in fit(self, X, y, sample_weight)
    236 
    237         if getattr(y, "dtype", None) != DOUBLE or not y.flags.contiguous:
--> 238             y = np.ascontiguousarray(y, dtype=DOUBLE)
    239 
    240         if expanded_class_weight is not None:

/Users/nandini/anaconda/lib/python3.3/site-packages/numpy/core/numeric.py in ascontiguousarray(a, dtype)
    548 
    549     """
--> 550     return array(a, dtype, copy=False, order='C', ndmin=1)
    551 
    552 def asfortranarray(a, dtype=None):

ValueError: could not convert string to float: '<-15.0'

In [ ]: